MID-TERM EXAM

Imports

In [137]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from datetime import datetime, timedelta
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn import mixture
from sklearn.neighbors import KernelDensity
import plotly.io as pio
import warnings
# Suppress every Python warning raised after this point, from any library.
warnings.filterwarnings('ignore')
# Use Plotly's "notebook" renderer as the default for fig.show().
pio.renderers.default = "notebook"

Data Exploration

In [138]:
# Read the raw invoice lines; the file is decoded as ISO-8859-1 instead of the default UTF-8.
data = pd.read_csv(filepath_or_buffer='data.csv', encoding="ISO-8859-1")
# Preview the first five rows.
data.head(n=5)
Out[138]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom

Removing inconsistencies

In [139]:
# Discard rows whose quantity or unit price is zero or negative.
# The mask is negated (rather than testing "> 0") so rows with a missing value are kept.
is_bad_row = (data['Quantity'] <= 0) | (data['UnitPrice'] <= 0)
data = data.loc[~is_bad_row].copy()

# Derived columns, appended in this order: Total, Date, Month.
# InvoiceDate is converted to datetime in place; Date and Month are built from the converted column.
data = data.assign(
    Total=lambda frame: frame['Quantity'] * frame['UnitPrice'],    # value of each invoice line
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']),
    Date=lambda frame: frame['InvoiceDate'].dt.date,               # calendar day of the invoice
    Month=lambda frame: frame['InvoiceDate'].dt.to_period('M'),    # monthly period of the invoice
)

# Distinct invoice days, in order of first appearance
unique_dates = data['Date'].unique()

data.head()
Out[139]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country Total Date Month
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55 17850.0 United Kingdom 15.30 2010-12-01 2010-12
1 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 20.34 2010-12-01 2010-12
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 08:26:00 2.75 17850.0 United Kingdom 22.00 2010-12-01 2010-12
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 20.34 2010-12-01 2010-12
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 20.34 2010-12-01 2010-12

Exploring Countries

In [140]:
# Total sales value per country (sum of the line totals)
counts_countries = data.groupby('Country')['Total'].sum()

# Plot: one bar per country
country_bars = go.Bar(x=counts_countries.index.to_list(),
                      y=counts_countries.to_list())
fig = go.Figure(data=[country_bars])
fig.update_layout(barmode='group')
fig.show()

Exploring Products

In [141]:
# Number of distinct invoices in which each product (StockCode) appears
counts_products = data.groupby('StockCode')['InvoiceNo'].nunique()
list_products = np.array(counts_products.index.to_list())
list_counts = np.array(counts_products.to_list())

# Ordering that ranks products from fewest to most invoices
sort_index = np.argsort(list_counts)

# Plot: two-column table, product code next to its invoice count
products_table = go.Table(
    header=dict(values=['Product', 'Number of Invoices']),
    cells=dict(values=[list_products[sort_index], list_counts[sort_index]]),
)
fig = go.Figure(data=[products_table])
fig.show()

Exploring Customers

In [142]:
# Number of invoice lines (rows) recorded for each customer.
# count() tallies rows, not distinct invoices, so the table header says "Invoice Lines".
counts_clients = data.groupby('CustomerID')['InvoiceNo'].count()
list_counts = np.array(counts_clients.to_list())

# CustomerID is read as a float column (e.g. 17850.0); cast the group keys to int so the
# table shows plain IDs. groupby leaves out rows whose CustomerID is missing, so the cast is safe.
list_clients = np.array(counts_clients.keys().to_list()).astype(int)

# Ordering that ranks customers from fewest to most invoice lines
sort_index = np.argsort(list_counts)

# Plot: two-column table, customer ID next to its number of invoice lines
fig = go.Figure([go.Table(header=dict(values=['Client ID', 'Number of Invoice Lines']),
                          cells=dict(values=[list_clients[sort_index], list_counts[sort_index]]))
                ])
fig.show()

Get total sales per Month

In [143]:
# Total sales value per calendar month; groupby returns the months in sorted order
total = data.groupby('Month')['Total'].sum()

# First day of every month, read from the index of `total` so that each x value belongs to
# its y value by construction (no reliance on the row order of `data`).
unique_dates = total.index.start_time.date

# Scatter Data
data_fig = go.Scatter(x=unique_dates, y=total, name="Sales", line_color='deepskyblue')

# Layout: centred bold title and bold axis titles sharing one font
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Total Sales</b>", xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Sales</b>', font=axis_title_font)),
)

# Make figure (with a range slider under the x axis)
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

Bayesian Regression of Sales per Month

In [144]:
# Forecast horizon in days, and half-width of the plotted band in standard deviations
num_days_predict = 360
num_sigma = 2

# Month start dates read from the index of `total`, so every date lines up with its monthly sum
train_dates = total.index.start_time.date
origin = train_dates[0].toordinal()

# Training Data: days elapsed since the first month, as a single-feature column
X = np.array([day.toordinal() - origin for day in train_dates]).reshape(-1, 1)
y = total

# Test Data: one point per day, starting on the last training day
X_test = np.arange(X[-1, 0], X[-1, 0] + num_days_predict).reshape(-1, 1)
X_test_dates = np.array([datetime.fromordinal(int(day) + origin) for day in X_test[:, 0]])

# Train and predict.
# No hyper-parameter is tuned, so the estimator is fitted directly on all months. A grid search
# over an empty grid would only refit this same estimator, and leave-one-out folds cannot be
# scored with the default R^2 (one test sample per fold).
reg = BayesianRidge(tol=0.0001, fit_intercept=True, compute_score=True)
reg.fit(X, y)
y_mean_train, y_stdv_train = reg.predict(X, return_std=True)
y_mean, y_stdv = reg.predict(X_test, return_std=True)

# Scatter Data: observed monthly totals
source = go.Scatter(x=train_dates,
                    y=total,
                    name="Sales",
                    mode='lines+markers',
                    line=dict(color='deepskyblue', dash='solid'))

# In-sample fit: mean and +/- num_sigma band.
# 'tonexty' shades towards the trace listed just before this one in data_fig (predicted_low).
predicted_up = go.Scatter(x=train_dates,
                          y=y_mean_train + num_sigma * y_stdv_train,
                          name="Predicted Upper",
                          mode='lines',
                          line=dict(color='red', dash='dash'),
                          fill='tonexty',
                          fillcolor='rgba(255, 0, 0, 0.1)')
predicted_low = go.Scatter(x=train_dates,
                           y=y_mean_train - num_sigma * y_stdv_train,
                           name="Predicted Lower",
                           mode='lines',
                           line=dict(color='red', dash='dash'))
predicted_mean = go.Scatter(x=train_dates,
                            y=y_mean_train,
                            name="Predicted Mean",
                            mode='lines',
                            line=dict(color='red', dash='solid'))

# Forecast: mean and +/- num_sigma band; forecast_low shades up to forecast_up via 'tonexty'
forecast_up = go.Scatter(x=X_test_dates,
                         y=y_mean + num_sigma * y_stdv,
                         name="Forecast Upper",
                         line=dict(color='green', dash='dash'))
forecast_low = go.Scatter(x=X_test_dates,
                          y=y_mean - num_sigma * y_stdv,
                          name="Forecast Lower",
                          line=dict(color='green', dash='dash'),
                          fill='tonexty',
                          fillcolor='rgba(0, 255, 0, 0.1)')
forecast_mean = go.Scatter(x=X_test_dates,
                           y=y_mean,
                           name="Forecast Mean",
                           line=dict(color='green', dash='solid'))

# The order matters: each 'tonexty' fill targets the trace immediately before it
data_fig = [source, predicted_low, predicted_up, forecast_up, forecast_low, predicted_mean, forecast_mean]

# Layout: centred bold title and bold axis titles sharing one font
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Total Sales</b>", xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Sales</b>', font=axis_title_font)),
)

# Make figure (with a range slider under the x axis)
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

Get demand per Month

In [145]:
# Stock code of the product whose demand is analysed
product_id = '85123A'

# Units of this product sold per calendar month (only months with at least one sale appear)
product = data.loc[data['StockCode']==product_id]
total_product = product.groupby('Month')['Quantity'].sum()

# First day of each of those months, read from the index of `total_product`. Taking the months
# from the whole data set instead would give more dates than sums whenever the product has a
# month without sales.
unique_dates = total_product.index.start_time.date

# Scatter Data
data_fig = go.Scatter(x=unique_dates, y=total_product, name="Demand {}".format(product_id), line_color='deepskyblue')

# Layout: centred bold title and bold axis titles sharing one font
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Demand {}</b>".format(product_id), xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Demand</b>', font=axis_title_font)),
)

# Make figure (with a range slider under the x axis)
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

Demand per Month of Top Product

In [146]:
# Half-width of the plotted band in standard deviations, and forecast horizon in days
num_sigma = 2
num_days_predict = 360

# Month start dates read from the index of `total_product`, so X and y always have the same
# length and the same order, whichever months the product was sold in.
train_dates = total_product.index.start_time.date
origin = train_dates[0].toordinal()

# Training Data: days elapsed since the first month, as a single-feature column
X = np.array([day.toordinal() - origin for day in train_dates]).reshape(-1, 1)
y = total_product

# Test Data: one point per day, starting on the last training day
X_test = np.arange(X[-1, 0], X[-1, 0] + num_days_predict).reshape(-1, 1)
X_test_dates = np.array([datetime.fromordinal(int(day) + origin) for day in X_test[:, 0]])

# Fit a Bayesian ridge regression of monthly demand on elapsed days
reg = BayesianRidge(tol=1e-6, fit_intercept=True, compute_score=True)
reg.fit(X, y)

# Predictive mean and standard deviation, in-sample and over the forecast horizon
y_mean_train, y_stdv_train = reg.predict(X, return_std=True)
y_mean, y_stdv = reg.predict(X_test, return_std=True)

# Scatter Data: observed monthly demand (legend label matches the quantity on the y axis)
source = go.Scatter(x=train_dates,
                    y=total_product,
                    name="Demand",
                    mode='lines+markers',
                    line=dict(color='deepskyblue', dash='solid'))

# In-sample fit: mean and +/- num_sigma band.
# 'tonexty' shades towards the trace listed just before this one in data_fig (predicted_low).
predicted_up = go.Scatter(x=train_dates,
                          y=y_mean_train + num_sigma * y_stdv_train,
                          name="Predicted Upper",
                          mode='lines',
                          line=dict(color='red', dash='dash'),
                          fill='tonexty',
                          fillcolor='rgba(255, 0, 0, 0.1)')
predicted_low = go.Scatter(x=train_dates,
                           y=y_mean_train - num_sigma * y_stdv_train,
                           name="Predicted Lower",
                           mode='lines',
                           line=dict(color='red', dash='dash'))
predicted_mean = go.Scatter(x=train_dates,
                            y=y_mean_train,
                            name="Predicted Mean",
                            mode='lines',
                            line=dict(color='red', dash='solid'))

# Forecast: mean and +/- num_sigma band; forecast_low shades up to forecast_up via 'tonexty'
forecast_up = go.Scatter(x=X_test_dates,
                         y=y_mean + num_sigma * y_stdv,
                         name="Forecast Upper",
                         line=dict(color='green', dash='dash'))
forecast_low = go.Scatter(x=X_test_dates,
                          y=y_mean - num_sigma * y_stdv,
                          name="Forecast Lower",
                          line=dict(color='green', dash='dash'),
                          fill='tonexty',
                          fillcolor='rgba(0, 255, 0, 0.1)')
forecast_mean = go.Scatter(x=X_test_dates,
                           y=y_mean,
                           name="Forecast Mean",
                           line=dict(color='green', dash='solid'))

# The order matters: each 'tonexty' fill targets the trace immediately before it
data_fig = [source, predicted_low, predicted_up, forecast_up, forecast_low, predicted_mean, forecast_mean]

# Layout: centred bold title and bold axis titles sharing one font
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Demand for Most Demanded Product</b>", xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Demand</b>', font=axis_title_font)),
)

# Make figure (with a range slider under the x axis)
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

Histograms for top 3 clients

In [147]:
# Top three customers
customer_ids = [17841, 14911, 14096]
data_fig = []

# Dictionary of deltas
dt_dict = {}
for c in customer_ids:
    dt_dict[c] = []

unique_dates = {}
for i, customer_id in enumerate(customer_ids):
    
    # Get customer dates
    customer = data.loc[data['CustomerID']==customer_id]
    unique_dates[customer_id] = (customer.InvoiceDate.dt.date).unique()

    # Get dates differences
    dt_days = [(unique_dates[customer_id][i+1] - unique_dates[customer_id][i]).days for i in range(len(unique_dates[customer_id])-1)]
    dt_dict[customer_id] = np.array(dt_days)

    # Histograms
    call_frequency = go.Histogram(x=dt_days, 
                                  name='Customer {}'.format(i+1),
                                 opacity=0.5)
    data_fig.append(call_frequency)

# Layout
layout = go.Layout(
    title=go.layout.Title(
        text="<b>Histogram of days between calls</b>",
        xref='paper',
        x=0.5,
        y = 0.9
    )
)

# Sub titles
subplot_titles = tuple(['<b>Customer ' + str(i+1) + '</b>' for i in range(0, len(customer_ids))])

# Create figure
fig = make_subplots(rows=len(customer_ids), 
                    cols=1,
                    subplot_titles=subplot_titles
                   )
for i, d in enumerate(data_fig):
    fig.append_trace(d, i+1, 1)
    fig.update_xaxes(title_text="<b>Days between Calls</b>", row=i+1, col=1)
    fig.update_yaxes(title_text="<b>Frequency</b>", row=i+1, col=1)
    
fig.update_layout(height=650)
fig.show()

Predict next call for top 3 clients

In [153]:
# Number of points on which each fitted density is evaluated
n_samples = 1000

# Hyper-parameters searched for every customer
k_folds = 10
bandwidths = 10 ** np.linspace(-1, 1, 100)
kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']
metrics = ['euclidean', 'minkowski']

# Model selection for all customers
best_kdes = {}
xs = {}
for k in dt_dict.keys():
    # Evaluation grid: from 0 to the largest observed gap, as a single-feature column
    xs[k] = np.linspace(0, max(dt_dict[k]), n_samples).reshape(-1, 1)

    # Kernel density estimator; bandwidth, kernel and metric are chosen by cross-validation
    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': bandwidths, 'kernel':kernels, 'metric':metrics},
                        cv=k_folds,
                        n_jobs=-1)

    dt = dt_dict[k].reshape(-1, 1)
    grid.fit(dt)
    best_kdes[k] = grid.best_estimator_

# Get PDF for all customers
pdfs = {}
for k in best_kdes.keys():
    # score_samples returns log-densities
    density = np.exp(best_kdes[k].score_samples(xs[k]))

    # Keep only points whose density is at most 1, and drop the SAME points from the grid so
    # that xs[k] and pdfs[k] stay aligned point by point.
    # NOTE(review): a density may legitimately exceed 1 for narrow bandwidths - confirm this
    # filter is wanted.
    keep = density <= 1
    xs[k] = xs[k][keep]
    pdfs[k] = density[keep]

# Sub titles
subplot_titles = tuple(['<b>Customer ' + str(i+1) + '</b>' for i in range(0, len(customer_ids))])

# Create figure: one row per customer
fig = make_subplots(rows=len(customer_ids),
                    cols=1,
                    subplot_titles=subplot_titles)

for i, k in enumerate(dt_dict.keys()):
    # Histogram normalised as a density so that it is on the same scale as the estimated PDF
    fig.add_trace(go.Histogram(x=dt_dict[k],
                               histnorm='probability density',
                               name='<b>Histogram Customer {}</b>'.format(i+1)),
                  row=i+1,
                  col=1)
    # Estimated density over the evaluation grid
    fig.add_trace(go.Scatter(x=xs[k][:,0],
                             y=pdfs[k],
                             name='<b>Mixture Customer {}</b>'.format(i+1)),
                  row=i+1,
                  col=1)

fig.update_layout(height=500)
fig.show()
In [156]:
# Half-width (in days) of the window around the candidate date, and number of integration points
range_days = 2
n_samples = 1000

# Seeded generator so the randomly drawn candidate dates are the same on every run
rng = np.random.default_rng(0)

# Initialize
num_days = {}
dt_possible = {}
dt_days = {}
possible_date_time = {}

# Get Interval of Days: for each customer draw a candidate gap of 4 to 9 days after the last purchase
for k in unique_dates.keys():
    num_days[k] = int(rng.integers(4, 10))    # upper bound is exclusive; plain int for timedelta
    dt_possible[k] = timedelta(days=num_days[k])
    possible_date_time[k] = unique_dates[k][-1] + dt_possible[k]
    dt_days[k] = dt_possible[k].days

# Initialize
pdf_samples = {}
samples = {}
proba = {}
lim_sup = {}
lim_inf = {}
min_lim = {}
max_lim = {}

# Compute probabilities for each customer
for k in dt_days.keys():
    # Window of +/- range_days around the candidate gap, sampled on a regular grid
    min_lim[k] = dt_days[k] - range_days
    max_lim[k] = dt_days[k] + range_days
    samples[k] = np.linspace(min_lim[k], max_lim[k], n_samples)

    # Density of the customer's fitted estimator on that grid (score_samples returns log-densities)
    logprob = best_kdes[k].score_samples(samples[k].reshape(-1, 1))
    pdf_samples[k] = np.exp(logprob)

    # Keep only points whose density is at most 1 (grid and density filtered together).
    # NOTE(review): densities above 1 can be valid; dropping them lowers the integral - confirm.
    indices = pdf_samples[k] <= 1
    samples[k] = samples[k][indices]
    pdf_samples[k] = pdf_samples[k][indices]

    # Probability mass inside the window, by trapezoidal integration of the density
    proba[k] = np.trapz(pdf_samples[k], samples[k])

    # Calendar dates bounding the window
    lim_inf[k] = possible_date_time[k] - timedelta(range_days)
    lim_sup[k] = possible_date_time[k] + timedelta(range_days)

# Sub titles
subplot_titles = tuple(['<b>Customer ' + str(i+1) + '</b>' for i in range(0, len(customer_ids))])

# Create figure: one row per customer
fig = make_subplots(rows=len(customer_ids),
                    cols=1,
                    subplot_titles=subplot_titles)

for i, k in enumerate(dt_dict.keys()):
    # Full estimated density of the gaps
    fig.add_trace(go.Scatter(x=xs[k][:,0],
                             y=pdfs[k],
                             name='<b>Mixture Customer {}</b>'.format(i+1)),
                  row=i+1,
                  col=1)
    # Shaded area under the density inside the window (numbered from 1 like the other traces)
    fig.add_trace(go.Scatter(x=samples[k],
                             y=pdf_samples[k],
                             fill='tozeroy',
                             name='<b>Predicted Customer {}</b>'.format(i+1)),
                  row=i+1,
                  col=1)
    # Text with the window dates and its probability, placed just above the curve's maximum
    fig.add_annotation(x=min_lim[k],
                       y=max(1.1*pdfs[k]),
                       text="<b>Probability of call between {0} and {1} = {2:1.2f}</b>".format(lim_inf[k],
                                                                                              lim_sup[k],
                                                                                              proba[k]),
                       showarrow=False,
                       row=i+1,
                       col=1)

fig.update_layout(height=550)
fig.show()